import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from mpl_toolkits.mplot3d import Axes3D
import plotly.io as pio
pio.renderers.default = "notebook"
The data corresponds to house sales in Ames, Iowa. All data was collected from: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
# Paths of the Kaggle CSV files, relative to the working directory
file_name_train = 'Data/train.csv'
file_name_test = 'Data/test.csv'
# Read the training split first, then the test split, into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (file_name_train, file_name_test)
)
# Preview the first 100 training samples (houses)
train_data.head(100)
We want to see the model in two dimensions. Thus, we are going to consider ONLY ONE feature (the area of the first floor) and the response variable (the price of the house)
# Training data: first-floor area is the single feature, sale price is the target
x_train_pd = train_data['1stFlrSF']
y_train_pd = train_data['SalePrice']
x_train_list = x_train_pd.tolist()
y_train_list = y_train_pd.tolist()
# Turn each list into a float64 column vector with one row per house
x_train = np.array(x_train_list, dtype='float64').reshape(-1, 1)
y_train = np.array(y_train_list, dtype='float64').reshape(-1, 1)
# Test data: same feature column, taken from the test CSV (no target is read here)
x_test_pd = test_data['1stFlrSF']
x_test_list = x_test_pd.tolist()
x_test = np.array(x_test_list, dtype='float64').reshape(-1, 1)
print(x_train.shape, y_train.shape, x_test.shape)
We plot the data:
# Scatter plot of sale price against first-floor area
plt.figure()
plt.scatter(x_train, y_train, alpha=1)
plt.title('Homes in Iowa')
# The '1stFlrSF' column is measured in square feet, not square metres
plt.xlabel('Area of first floor [ft2]')
plt.ylabel('Price [$]')
plt.show()
We want to consider a second feature: the surface area of the second floor
# Second feature: second-floor area, as a float64 column vector.
# Reshaping against x_train's row count fails loudly if the two features
# ever end up with a different number of samples.
x_train_pd_2 = train_data['2ndFlrSF']
x_train_list_2 = x_train_pd_2.values.tolist()
x_train_2 = np.array(x_train_list_2, dtype='float64').reshape(x_train.shape[0], 1)
# Scatter plot of sale price against second-floor area
plt.figure()
plt.scatter(x_train_2, y_train, alpha=1)
plt.title('Homes in Iowa')
# The '2ndFlrSF' column is measured in square feet, not square metres
plt.xlabel('Area of second floor [ft2]')
plt.ylabel('Price [$]')
plt.show()
Graph in 3D (considering the two features: surface of first floor and surface of second floor)
import plotly.graph_objects as go

# One marker per house: x = first-floor area, y = second-floor area, z = sale price
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=list(x_train[:, 0]),
            y=list(x_train_2[:, 0]),
            z=list(y_train[:, 0]),
            mode='markers',
            marker={
                'size': 4,
                'colorscale': 'Viridis',  # choose a colorscale
                'opacity': 0.8,
            },
        )
    ]
)
# Centered bold title, common font, and titles for the two feature axes
fig.update_layout(
    title=dict(
        text="<b>Homes in Iowa</b>",
        y=0.8,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    font={
        'family': "Arial, monospace",
        'size': 12,
        'color': "black",
    },
    scene={
        'xaxis_title': '<b>Area of 1st floor </b>',
        'yaxis_title': '<b>Area of 2nd floor </b>',
    },
)
fig.show()
The hypothesis for one feature (surface area of the first floor) is defined as:
$$h(x)=w^{T}x$$
# Hypothesis function
def h(w, x):
    """
    Linear hypothesis: evaluates h = w' * x.

    params:
        w: [np_array] a vector of weights with dimensions (nx1), where n represents the number of weights.
        x: [np_array] a matrix of feature variables with dimensions (nxm),
           where n represents the number of feature variables and m the number of examples
    returns:
        h: [np_array] the product of the transposed weights with x;
           for w (nx1) and x (nxm) it has dimensions (1xm), one estimate per example
    """
    weights_row = w.T
    return np.dot(weights_row, x)
# Take the first training example and prepend the constant 1 for the intercept term
x0 = 1  # intercept input
x1 = x_train[0, 0]
x = np.array([x0, x1], dtype='float64').reshape(-1, 1)
# Pick an ARBITRARY parameter vector w = [w0, w1] as a column
w0 = 3  # intercept weight
w1 = 100
w = np.array([w0, w1], dtype='float64').reshape(-1, 1)
# Evaluate the hypothesis and compare it with the true sale price
pred = h(w, x)
print(f'Prediction: {pred[0][0]} USD')
print(f'Real label: {y_train[0][0]} USD')
We compute predictions for all our training samples (houses).
# Prepend a column of ones for the intercept, then transpose so that
# x_train_b has one row per parameter and one column per house
x_train_b = np.insert(x_train, 0, 1, axis=1)
x_train_b = x_train_b.T
# We define an ARBITRARY vector of parameters (weights): [w0, w1]
w0 = 0  # intercept
w1 = 175
w = np.array([w0, w1], dtype='float64')
w = w.reshape((w.shape[0], 1))
# One prediction per training sample
preds = h(w, x_train_b)
print(f'Shape of the vector of predictions: {preds.shape}')
# Overlay the model's line (red) on the training data
plt.figure()
plt.scatter(x_train, y_train, alpha=1)
plt.plot(x_train, preds.T, color='r')
plt.title('Homes in Iowa')
# The '1stFlrSF' column is measured in square feet, not square metres
plt.xlabel('Surface of the first floor [ft2]')
plt.ylabel('Price [$]')
plt.show()
We define the cost function as the Mean Squared Error (MSE), scaled by 1/2:
$$cost(w) = \frac{1}{2m}\sum_{i=1}^{m}(h(x^{(i)})-y^{(i)})^{2}=\frac{1}{2m}(X^{T}w-\vec{y})^{T}(X^{T}w-\vec{y})$$
# Cost function
def cost(w, x, y):
    """
    Half mean squared error of the linear hypothesis h over a set of examples.

    params:
        w: [np_array] a vector of weights with dimensions (nx1), where n represents the number of weights.
        x: [np_array] a matrix of feature variables with dimensions (nxm),
           where n represents the number of feature variables and m the number of training examples
        y: [np_array] a vector of target variables with dimensions (mx1);
           a flat vector of m targets is also accepted
    returns:
        cost: [double] the sum of squared errors divided by 2*m
    """
    targets = np.asarray(y)
    if targets.ndim == 1:
        # A flat (m,) vector would broadcast against the (mx1) prediction
        # column into an (mxm) matrix and silently inflate the cost, so it
        # is promoted to a column first.
        targets = targets.reshape(-1, 1)
    residuals = h(w, x).T - targets
    return (1/(2*x.shape[1])) * (np.sum(np.square(residuals)))
We are going to plot the cost function for 800 x 800 possible pairs of parameters w0 and w1 to see how our cost function is defined by a convex shape (has a global minimum).
import plotly.graph_objects as go

# Candidate values for each weight: num_data consecutive values centered on 0
num_data = 800
w0_v = np.arange(-num_data/2, num_data/2, dtype='float64')
w1_v = np.arange(-num_data/2, num_data/2, dtype='float64')
# 'ij' indexing gives X[i, j] == w0_v[i] and Y[i, j] == w1_v[j], which matches
# how mse[i, j] is filled below. The default 'xy' indexing would transpose the
# grids and draw the surface with w0 and w1 swapped relative to the axis titles.
X, Y = np.meshgrid(w0_v, w1_v, indexing='ij')
# We initialize our matrix of MSE
mse = np.zeros((num_data, num_data))
# We obtain the MSE for each possible combination of w0 and w1
for i, w0_candidate in enumerate(w0_v):
    for j, w1_candidate in enumerate(w1_v):
        w = np.array([w0_candidate, w1_candidate]).reshape(-1, 1)
        mse[i, j] = cost(w, x_train_b, y_train)
# We plot the cost function in 3D
fig = go.Figure(data=[go.Surface(z=mse, x=X, y=Y)])
fig.update_layout(
    title={
        'text': "<b>Cost function</b>",
        'y': 0.8,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    font=dict(
        family="Arial, monospace",
        size=12,
        color="black",
    ),
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
    scene=dict(
        xaxis_title='<b>Weight w0</b>',
        yaxis_title='<b>Weight w1</b>',
    ),
)
fig.show()